In [1]:
# standard library
import os

# for data manipulation and analysis
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

# for visualisation
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
In [2]:
# Location of the Spotify/YouTube CSV. Set the SPOTIFY_YOUTUBE_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'SPOTIFY_YOUTUBE_CSV',
    '/Users/suryanshjamwal/Desktop/Analytics Case studies/Spotify/Spotify_Youtube.csv',
)

# Load the raw dataset into a pandas DataFrame.
df = pd.read_csv(DATA_PATH)
In [3]:
# Render the freshly loaded frame; the output is pandas' truncated view
# (first and last rows) followed by the overall shape.
df
Out[3]:
Unnamed: 0 Artist Url_spotify Track Album Album_type Uri Danceability Energy Key ... Url_youtube Title Channel Views Likes Comments Description Licensed official_video Stream
0 0 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... Feel Good Inc. Demon Days album spotify:track:0d28khcov6AiegSCpG5TuT 0.818 0.705 6.0 ... https://www.youtube.com/watch?v=HyHNuVaZJ-k Gorillaz - Feel Good Inc. (Official Video) Gorillaz 693555221.0 6220896.0 169907.0 Official HD Video for Gorillaz' fantastic trac... True True 1.040235e+09
1 1 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... Rhinestone Eyes Plastic Beach album spotify:track:1foMv2HQwfQ2vntFf9HFeG 0.676 0.703 8.0 ... https://www.youtube.com/watch?v=yYDmaexVHic Gorillaz - Rhinestone Eyes [Storyboard Film] (... Gorillaz 72011645.0 1079128.0 31003.0 The official video for Gorillaz - Rhinestone E... True True 3.100837e+08
2 2 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... New Gold (feat. Tame Impala and Bootie Brown) New Gold (feat. Tame Impala and Bootie Brown) single spotify:track:64dLd6rVqDLtkXFYrEUHIU 0.695 0.923 1.0 ... https://www.youtube.com/watch?v=qJa-VFwPpYA Gorillaz - New Gold ft. Tame Impala & Bootie B... Gorillaz 8435055.0 282142.0 7399.0 Gorillaz - New Gold ft. Tame Impala & Bootie B... True True 6.306347e+07
3 3 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... On Melancholy Hill Plastic Beach album spotify:track:0q6LuUqGLUiCPP1cbdwFs3 0.689 0.739 2.0 ... https://www.youtube.com/watch?v=04mfKJWDSzI Gorillaz - On Melancholy Hill (Official Video) Gorillaz 211754952.0 1788577.0 55229.0 Follow Gorillaz online:\nhttp://gorillaz.com \... True True 4.346636e+08
4 4 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... Clint Eastwood Gorillaz album spotify:track:7yMiX7n9SBvadzox8T5jzT 0.663 0.694 10.0 ... https://www.youtube.com/watch?v=1V_xRb0x9aw Gorillaz - Clint Eastwood (Official Video) Gorillaz 618480958.0 6197318.0 155930.0 The official music video for Gorillaz - Clint ... True True 6.172597e+08
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20713 20713 SICK LEGEND https://open.spotify.com/artist/3EYY5FwDkHEYLw... JUST DANCE HARDSTYLE JUST DANCE HARDSTYLE single spotify:track:0RtcKQGyI4hr8FgFH1TuYG 0.582 0.926 5.0 ... https://www.youtube.com/watch?v=5SHmKFKlNqI JUST DANCE HARDSTYLE SICK LEGEND - Topic 71678.0 1113.0 0.0 Provided to YouTube by Routenote\n\nJUST DANCE... True True 9.227144e+06
20714 20714 SICK LEGEND https://open.spotify.com/artist/3EYY5FwDkHEYLw... SET FIRE TO THE RAIN HARDSTYLE SET FIRE TO THE RAIN HARDSTYLE single spotify:track:3rHvPA8lUnPBkaLyPOc0VV 0.531 0.936 4.0 ... https://www.youtube.com/watch?v=ocTH6KxllDQ SET FIRE TO THE RAIN HARDSTYLE SICK LEGEND - Topic 164741.0 2019.0 0.0 Provided to YouTube by Routenote\n\nSET FIRE T... True True 1.089818e+07
20715 20715 SICK LEGEND https://open.spotify.com/artist/3EYY5FwDkHEYLw... OUTSIDE HARDSTYLE SPED UP OUTSIDE HARDSTYLE SPED UP single spotify:track:4jk00YxPtPbhvHJE9N4ddv 0.443 0.830 4.0 ... https://www.youtube.com/watch?v=5wFhE-HY0hg OUTSIDE HARDSTYLE SPED UP SICK LEGEND - Topic 35646.0 329.0 0.0 Provided to YouTube by Routenote\n\nOUTSIDE HA... True True 6.226110e+06
20716 20716 SICK LEGEND https://open.spotify.com/artist/3EYY5FwDkHEYLw... ONLY GIRL HARDSTYLE ONLY GIRL HARDSTYLE single spotify:track:5EyErbpsugWliX006eTDex 0.417 0.767 9.0 ... https://www.youtube.com/watch?v=VMFLbFRNCn0 ONLY GIRL HARDSTYLE SICK LEGEND - Topic 6533.0 88.0 0.0 Provided to YouTube by Routenote\n\nONLY GIRL ... True True 6.873961e+06
20717 20717 SICK LEGEND https://open.spotify.com/artist/3EYY5FwDkHEYLw... MISS YOU HARDSTYLE MISS YOU HARDSTYLE single spotify:track:6lOn0jz1QpjcWeXo1oMm0k 0.498 0.938 6.0 ... https://www.youtube.com/watch?v=zau0dckCFi0 MISS YOU HARDSTYLE SICK LEGEND - Topic 158697.0 2484.0 0.0 Provided to YouTube by Routenote\n\nMISS YOU H... True True 5.695584e+06

20718 rows × 28 columns

In [4]:
# Peek at the first five rows of the raw frame.
df.head()
Out[4]:
Unnamed: 0 Artist Url_spotify Track Album Album_type Uri Danceability Energy Key ... Url_youtube Title Channel Views Likes Comments Description Licensed official_video Stream
0 0 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... Feel Good Inc. Demon Days album spotify:track:0d28khcov6AiegSCpG5TuT 0.818 0.705 6.0 ... https://www.youtube.com/watch?v=HyHNuVaZJ-k Gorillaz - Feel Good Inc. (Official Video) Gorillaz 693555221.0 6220896.0 169907.0 Official HD Video for Gorillaz' fantastic trac... True True 1.040235e+09
1 1 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... Rhinestone Eyes Plastic Beach album spotify:track:1foMv2HQwfQ2vntFf9HFeG 0.676 0.703 8.0 ... https://www.youtube.com/watch?v=yYDmaexVHic Gorillaz - Rhinestone Eyes [Storyboard Film] (... Gorillaz 72011645.0 1079128.0 31003.0 The official video for Gorillaz - Rhinestone E... True True 3.100837e+08
2 2 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... New Gold (feat. Tame Impala and Bootie Brown) New Gold (feat. Tame Impala and Bootie Brown) single spotify:track:64dLd6rVqDLtkXFYrEUHIU 0.695 0.923 1.0 ... https://www.youtube.com/watch?v=qJa-VFwPpYA Gorillaz - New Gold ft. Tame Impala & Bootie B... Gorillaz 8435055.0 282142.0 7399.0 Gorillaz - New Gold ft. Tame Impala & Bootie B... True True 6.306347e+07
3 3 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... On Melancholy Hill Plastic Beach album spotify:track:0q6LuUqGLUiCPP1cbdwFs3 0.689 0.739 2.0 ... https://www.youtube.com/watch?v=04mfKJWDSzI Gorillaz - On Melancholy Hill (Official Video) Gorillaz 211754952.0 1788577.0 55229.0 Follow Gorillaz online:\nhttp://gorillaz.com \... True True 4.346636e+08
4 4 Gorillaz https://open.spotify.com/artist/3AA28KZvwAUcZu... Clint Eastwood Gorillaz album spotify:track:7yMiX7n9SBvadzox8T5jzT 0.663 0.694 10.0 ... https://www.youtube.com/watch?v=1V_xRb0x9aw Gorillaz - Clint Eastwood (Official Video) Gorillaz 618480958.0 6197318.0 155930.0 The official music video for Gorillaz - Clint ... True True 6.172597e+08

5 rows × 28 columns

In [5]:
# Summarise dtype and non-null count for every column of the raw frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        20718 non-null  int64  
 1   Artist            20718 non-null  object 
 2   Url_spotify       20718 non-null  object 
 3   Track             20718 non-null  object 
 4   Album             20718 non-null  object 
 5   Album_type        20718 non-null  object 
 6   Uri               20718 non-null  object 
 7   Danceability      20716 non-null  float64
 8   Energy            20716 non-null  float64
 9   Key               20716 non-null  float64
 10  Loudness          20716 non-null  float64
 11  Speechiness       20716 non-null  float64
 12  Acousticness      20716 non-null  float64
 13  Instrumentalness  20716 non-null  float64
 14  Liveness          20716 non-null  float64
 15  Valence           20716 non-null  float64
 16  Tempo             20716 non-null  float64
 17  Duration_ms       20716 non-null  float64
 18  Url_youtube       20248 non-null  object 
 19  Title             20248 non-null  object 
 20  Channel           20248 non-null  object 
 21  Views             20248 non-null  float64
 22  Likes             20177 non-null  float64
 23  Comments          20149 non-null  float64
 24  Description       19842 non-null  object 
 25  Licensed          20248 non-null  object 
 26  official_video    20248 non-null  object 
 27  Stream            20142 non-null  float64
dtypes: float64(15), int64(1), object(12)
memory usage: 4.4+ MB

Data Manipulation and Cleaning¶

Checking for redundancy and null values.¶

In [6]:
# Columns removed before the analysis: the exported index copy, the
# Spotify/YouTube URLs and URI, the YouTube title/channel/description text
# and the official_video flag.
redundant_columns = [
    'Unnamed: 0', 'Title', 'Url_spotify', 'Uri',
    'Description', 'official_video', 'Channel', 'Url_youtube',
]
df = df.drop(columns=redundant_columns)
In [7]:
# List the columns that remain after the drop.
df.columns
Out[7]:
Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
       'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Views', 'Likes',
       'Comments', 'Licensed', 'Stream'],
      dtype='object')
In [8]:
# Keep only rows that have no missing value in any remaining column.
df = df.dropna(axis=0, how='any')
In [9]:
# Re-check dtypes and non-null counts after dropping rows with missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19549 entries, 0 to 20717
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Artist            19549 non-null  object 
 1   Track             19549 non-null  object 
 2   Album             19549 non-null  object 
 3   Album_type        19549 non-null  object 
 4   Danceability      19549 non-null  float64
 5   Energy            19549 non-null  float64
 6   Key               19549 non-null  float64
 7   Loudness          19549 non-null  float64
 8   Speechiness       19549 non-null  float64
 9   Acousticness      19549 non-null  float64
 10  Instrumentalness  19549 non-null  float64
 11  Liveness          19549 non-null  float64
 12  Valence           19549 non-null  float64
 13  Tempo             19549 non-null  float64
 14  Duration_ms       19549 non-null  float64
 15  Views             19549 non-null  float64
 16  Likes             19549 non-null  float64
 17  Comments          19549 non-null  float64
 18  Licensed          19549 non-null  object 
 19  Stream            19549 non-null  float64
dtypes: float64(15), object(5)
memory usage: 3.1+ MB
In [10]:
# Summary statistics for the numeric columns of the null-free frame.
df.describe()
Out[10]:
Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Stream
count 19549.000000 19549.000000 19549.000000 19549.000000 19549.000000 19549.000000 19549.000000 19549.000000 19549.000000 19549.000000 1.954900e+04 1.954900e+04 1.954900e+04 1.954900e+04 1.954900e+04
mean 0.621059 0.635170 5.294337 -7.633179 0.095392 0.289106 0.055292 0.191226 0.528950 120.605702 2.246281e+05 9.545626e+07 6.700487e+05 2.786371e+04 1.371101e+08
std 0.165489 0.213555 3.579338 4.618839 0.106243 0.285908 0.192519 0.165197 0.245228 29.619340 1.269126e+05 2.775744e+08 1.805054e+06 1.959074e+05 2.463589e+08
min 0.000000 0.000020 0.000000 -46.251000 0.000000 0.000001 0.000000 0.014500 0.000000 0.000000 3.098500e+04 2.600000e+01 0.000000e+00 0.000000e+00 6.574000e+03
25% 0.519000 0.508000 2.000000 -8.772000 0.035700 0.044400 0.000000 0.094000 0.339000 96.990000 1.802400e+05 1.911528e+06 2.238000e+04 5.310000e+02 1.781089e+07
50% 0.639000 0.666000 5.000000 -6.516000 0.050700 0.190000 0.000002 0.125000 0.536000 119.964000 2.132530e+05 1.491440e+07 1.279090e+05 3.343000e+03 4.979139e+07
75% 0.742000 0.797000 8.000000 -4.929000 0.104000 0.470000 0.000433 0.234000 0.725000 139.951000 2.519200e+05 7.152989e+07 5.266400e+05 1.449300e+04 1.390828e+08
max 0.975000 1.000000 11.000000 0.920000 0.964000 0.996000 1.000000 1.000000 0.993000 243.372000 4.676058e+06 8.079649e+09 5.078865e+07 1.608314e+07 3.386520e+09
In [11]:
# Keep the first row seen for each distinct track title.
# NOTE(review): the key is the title alone, so different songs that merely
# share a title (by different artists) are collapsed as well — confirm this
# is intended rather than de-duplicating on a track identifier.
df = df.drop_duplicates(subset=['Track'], keep='first')
In [12]:
# Display the frame after de-duplication (truncated view plus shape).
df
Out[12]:
Artist Track Album Album_type Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Licensed Stream
0 Gorillaz Feel Good Inc. Demon Days album 0.818 0.705 6.0 -6.679 0.1770 0.008360 0.002330 0.6130 0.7720 138.559 222640.0 693555221.0 6220896.0 169907.0 True 1.040235e+09
1 Gorillaz Rhinestone Eyes Plastic Beach album 0.676 0.703 8.0 -5.815 0.0302 0.086900 0.000687 0.0463 0.8520 92.761 200173.0 72011645.0 1079128.0 31003.0 True 3.100837e+08
2 Gorillaz New Gold (feat. Tame Impala and Bootie Brown) New Gold (feat. Tame Impala and Bootie Brown) single 0.695 0.923 1.0 -3.930 0.0522 0.042500 0.046900 0.1160 0.5510 108.014 215150.0 8435055.0 282142.0 7399.0 True 6.306347e+07
3 Gorillaz On Melancholy Hill Plastic Beach album 0.689 0.739 2.0 -5.810 0.0260 0.000015 0.509000 0.0640 0.5780 120.423 233867.0 211754952.0 1788577.0 55229.0 True 4.346636e+08
4 Gorillaz Clint Eastwood Gorillaz album 0.663 0.694 10.0 -8.627 0.1710 0.025300 0.000000 0.0698 0.5250 167.953 340920.0 618480958.0 6197318.0 155930.0 True 6.172597e+08
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20713 SICK LEGEND JUST DANCE HARDSTYLE JUST DANCE HARDSTYLE single 0.582 0.926 5.0 -6.344 0.0328 0.448000 0.000000 0.0839 0.6580 90.002 94667.0 71678.0 1113.0 0.0 True 9.227144e+06
20714 SICK LEGEND SET FIRE TO THE RAIN HARDSTYLE SET FIRE TO THE RAIN HARDSTYLE single 0.531 0.936 4.0 -1.786 0.1370 0.028000 0.000000 0.0923 0.6570 174.869 150857.0 164741.0 2019.0 0.0 True 1.089818e+07
20715 SICK LEGEND OUTSIDE HARDSTYLE SPED UP OUTSIDE HARDSTYLE SPED UP single 0.443 0.830 4.0 -4.679 0.0647 0.024300 0.000000 0.1540 0.4190 168.388 136842.0 35646.0 329.0 0.0 True 6.226110e+06
20716 SICK LEGEND ONLY GIRL HARDSTYLE ONLY GIRL HARDSTYLE single 0.417 0.767 9.0 -4.004 0.4190 0.356000 0.018400 0.1080 0.5390 155.378 108387.0 6533.0 88.0 0.0 True 6.873961e+06
20717 SICK LEGEND MISS YOU HARDSTYLE MISS YOU HARDSTYLE single 0.498 0.938 6.0 -4.543 0.1070 0.002770 0.911000 0.1360 0.0787 160.067 181500.0 158697.0 2484.0 0.0 True 5.695584e+06

16866 rows × 20 columns

Exploratory Data Analysis¶

In [13]:
# Numeric-only view of the cleaned frame, used for the statistics and
# correlation cells that follow.
numeric_df = df.select_dtypes(include='number')
In [14]:
# Display the numeric-only subset (truncated view plus shape).
numeric_df
Out[14]:
Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Stream
0 0.818 0.705 6.0 -6.679 0.1770 0.008360 0.002330 0.6130 0.7720 138.559 222640.0 693555221.0 6220896.0 169907.0 1.040235e+09
1 0.676 0.703 8.0 -5.815 0.0302 0.086900 0.000687 0.0463 0.8520 92.761 200173.0 72011645.0 1079128.0 31003.0 3.100837e+08
2 0.695 0.923 1.0 -3.930 0.0522 0.042500 0.046900 0.1160 0.5510 108.014 215150.0 8435055.0 282142.0 7399.0 6.306347e+07
3 0.689 0.739 2.0 -5.810 0.0260 0.000015 0.509000 0.0640 0.5780 120.423 233867.0 211754952.0 1788577.0 55229.0 4.346636e+08
4 0.663 0.694 10.0 -8.627 0.1710 0.025300 0.000000 0.0698 0.5250 167.953 340920.0 618480958.0 6197318.0 155930.0 6.172597e+08
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20713 0.582 0.926 5.0 -6.344 0.0328 0.448000 0.000000 0.0839 0.6580 90.002 94667.0 71678.0 1113.0 0.0 9.227144e+06
20714 0.531 0.936 4.0 -1.786 0.1370 0.028000 0.000000 0.0923 0.6570 174.869 150857.0 164741.0 2019.0 0.0 1.089818e+07
20715 0.443 0.830 4.0 -4.679 0.0647 0.024300 0.000000 0.1540 0.4190 168.388 136842.0 35646.0 329.0 0.0 6.226110e+06
20716 0.417 0.767 9.0 -4.004 0.4190 0.356000 0.018400 0.1080 0.5390 155.378 108387.0 6533.0 88.0 0.0 6.873961e+06
20717 0.498 0.938 6.0 -4.543 0.1070 0.002770 0.911000 0.1360 0.0787 160.067 181500.0 158697.0 2484.0 0.0 5.695584e+06

16866 rows × 15 columns

In [15]:
# Summary statistics of the numeric subset after de-duplication.
numeric_df.describe()
Out[15]:
Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Stream
count 16866.000000 16866.000000 16866.000000 16866.000000 16866.000000 16866.000000 16866.000000 16866.000000 16866.000000 16866.000000 1.686600e+04 1.686600e+04 1.686600e+04 1.686600e+04 1.686600e+04
mean 0.619050 0.635350 5.279912 -7.677653 0.094880 0.287854 0.056587 0.191661 0.530527 120.676194 2.226875e+05 8.777338e+07 6.102794e+05 2.543879e+04 1.300308e+08
std 0.165577 0.215155 3.574273 4.637705 0.107337 0.287661 0.193770 0.165636 0.246654 29.685312 9.287362e+04 2.562711e+08 1.644068e+06 1.517489e+05 2.330323e+08
min 0.000000 0.000020 0.000000 -46.251000 0.000000 0.000001 0.000000 0.014500 0.000000 0.000000 3.098500e+04 2.600000e+01 0.000000e+00 0.000000e+00 6.574000e+03
25% 0.515000 0.505000 2.000000 -8.874750 0.035400 0.041500 0.000000 0.094000 0.339000 96.963750 1.808240e+05 1.818648e+06 2.134500e+04 5.100000e+02 1.695603e+07
50% 0.636000 0.666000 5.000000 -6.551500 0.050200 0.186000 0.000003 0.124000 0.538000 119.969000 2.134770e+05 1.381934e+07 1.173160e+05 3.137000e+03 4.813189e+07
75% 0.741000 0.800000 8.000000 -4.946000 0.103000 0.472000 0.000517 0.236000 0.729000 139.982000 2.522248e+05 6.650796e+07 4.807730e+05 1.351250e+04 1.328671e+08
max 0.975000 1.000000 11.000000 0.920000 0.964000 0.996000 1.000000 1.000000 0.993000 243.372000 4.676058e+06 8.079647e+09 5.078863e+07 9.131761e+06 3.386520e+09

Checking Correlation between features¶

In [16]:
# Draw the pairwise correlation matrix of the numeric columns on an explicit
# figure/axes pair, with the colour scale pinned to the full [-1, 1] range.
corr_fig, corr_ax = plt.subplots(figsize=(16, 10))
correlations = numeric_df.corr()
heatmap = sns.heatmap(correlations, vmin=-1, vmax=1, annot=True, cmap='BrBG', ax=corr_ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':19}, pad=12)

# Save a high-resolution copy next to the notebook.
corr_fig.savefig('heatmap.png', dpi=550, bbox_inches='tight')

The correlation matrix shows only limited relationships between the different characteristics of a track.¶

We shall further examine the dataset to explore its integrity.¶

In [17]:
# Using spark.sql to query dataframe

# Start a Spark session, or reuse one that is already running.
session_builder = SparkSession.builder.appName("pyspark.sql")
spark = session_builder.getOrCreate()

# Mirror the cleaned pandas frame as a Spark DataFrame so it can be queried with SQL.
df_spark = spark.createDataFrame(df)

# Print the leading rows as a text table.
df_spark.show()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/02 18:28:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 1) / 1]
+--------------------+--------------------+--------------------+----------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-------+-----------+-------------+---------+--------+--------+-------------+
|              Artist|               Track|               Album|Album_type|Danceability|Energy| Key|Loudness|Speechiness|Acousticness|Instrumentalness|Liveness|Valence|  Tempo|Duration_ms|        Views|    Likes|Comments|Licensed|       Stream|
+--------------------+--------------------+--------------------+----------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-------+-----------+-------------+---------+--------+--------+-------------+
|            Gorillaz|      Feel Good Inc.|          Demon Days|     album|       0.818| 0.705| 6.0|  -6.679|      0.177|     0.00836|         0.00233|   0.613|  0.772|138.559|   222640.0| 6.93555221E8|6220896.0|169907.0|    true|1.040234854E9|
|            Gorillaz|     Rhinestone Eyes|       Plastic Beach|     album|       0.676| 0.703| 8.0|  -5.815|     0.0302|      0.0869|         6.87E-4|  0.0463|  0.852| 92.761|   200173.0|  7.2011645E7|1079128.0| 31003.0|    true| 3.10083733E8|
|            Gorillaz|New Gold (feat. T...|New Gold (feat. T...|    single|       0.695| 0.923| 1.0|   -3.93|     0.0522|      0.0425|          0.0469|   0.116|  0.551|108.014|   215150.0|    8435055.0| 282142.0|  7399.0|    true|  6.3063467E7|
|            Gorillaz|  On Melancholy Hill|       Plastic Beach|     album|       0.689| 0.739| 2.0|   -5.81|      0.026|     1.51E-5|           0.509|   0.064|  0.578|120.423|   233867.0| 2.11754952E8|1788577.0| 55229.0|    true| 4.34663559E8|
|            Gorillaz|      Clint Eastwood|            Gorillaz|     album|       0.663| 0.694|10.0|  -8.627|      0.171|      0.0253|             0.0|  0.0698|  0.525|167.953|   340920.0| 6.18480958E8|6197318.0|155930.0|    true| 6.17259738E8|
|            Gorillaz|                DARE|          Demon Days|     album|        0.76| 0.891|11.0|  -5.852|     0.0372|      0.0229|          0.0869|   0.298|  0.966|120.264|   245000.0| 2.59021161E8|1844658.0| 72008.0|    true| 3.23850327E8|
|            Gorillaz|New Gold (feat. T...|New Gold (feat. T...|    single|       0.716| 0.897| 4.0|  -7.185|     0.0629|       0.012|           0.262|   0.325|  0.358| 127.03|   274142.0|     451996.0|  11686.0|   241.0|   false|  1.0666154E7|
|            Gorillaz|She's My Collar (...|     Humanz (Deluxe)|     album|       0.726| 0.815|11.0|  -5.886|     0.0313|     0.00799|           0.081|   0.112|  0.462|140.158|   209560.0|    1010982.0|  17675.0|   260.0|   false| 1.59605929E8|
|            Gorillaz|Cracker Island (f...|Cracker Island (f...|    single|       0.741| 0.913| 2.0|   -3.34|     0.0465|     0.00343|           0.103|   0.325|  0.643|120.012|   213750.0|   2.445982E7| 739527.0| 20296.0|    true|  4.2671901E7|
|            Gorillaz|         Dirty Harry|          Demon Days|     album|       0.625| 0.877|10.0|  -7.176|      0.162|      0.0315|          0.0811|   0.672|  0.865|192.296|   230426.0| 1.54761056E8|1386920.0| 39240.0|    true| 1.91074713E8|
|Red Hot Chili Pep...|     Californication|Californication (...|     album|       0.592| 0.767| 9.0|  -2.788|      0.027|      0.0021|         0.00165|   0.127|  0.328| 96.483|   329733.0|1.018811259E9|4394471.0|121452.0|    true|1.055738398E9|
|Red Hot Chili Pep...|    Under the Bridge|Blood Sugar Sex M...|     album|       0.559| 0.345| 4.0| -13.496|     0.0459|      0.0576|         1.05E-4|   0.141|  0.458| 84.581|   264307.0| 2.46687714E8|1213572.0| 32761.0|    true|1.061750522E9|
|Red Hot Chili Pep...|          Can't Stop|By the Way (Delux...|     album|       0.618| 0.938| 9.0|  -3.442|     0.0456|      0.0179|             0.0|   0.167|  0.875| 91.455|   269000.0| 3.36635759E8|1740224.0| 32573.0|    true| 8.66464951E8|
|Red Hot Chili Pep...|         Scar Tissue|Californication (...|     album|       0.595| 0.717| 0.0|  -4.803|     0.0295|      0.0779|         0.00274|   0.108|  0.547| 88.969|   215907.0|  4.3512153E8|1890900.0| 37069.0|    true| 6.13838674E8|
|Red Hot Chili Pep...|           Otherside|Californication (...|     album|       0.458| 0.795| 0.0|  -3.265|     0.0574|     0.00316|         2.02E-4|  0.0756|  0.513|123.229|   255373.0| 6.73528656E8|3140356.0| 60091.0|    true| 7.32774515E8|
|Red Hot Chili Pep...|       Snow (Hey Oh)|    Stadium Arcadium|     album|       0.427|   0.9|11.0|  -3.674|     0.0499|       0.116|         1.75E-5|   0.119|  0.599|104.655|   334667.0| 3.20871237E8|1272266.0| 37004.0|    true| 8.60722316E8|
|Red Hot Chili Pep...|     Dani California|    Stadium Arcadium|     album|       0.556| 0.913| 0.0|   -2.36|     0.0437|      0.0193|         8.59E-6|   0.346|   0.73| 96.184|   282160.0| 3.24228662E8|1456622.0| 49461.0|    true| 5.50067391E8|
|Red Hot Chili Pep...|          By the Way|By the Way (Delux...|     album|       0.451|  0.97| 0.0|  -4.938|      0.107|      0.0264|         0.00355|   0.102|  0.198|122.444|   216933.0| 1.79005296E8| 784717.0| 20084.0|    true| 3.67485508E8|
|Red Hot Chili Pep...|        Give It Away|Blood Sugar Sex M...|     album|       0.666| 0.936| 7.0|  -9.919|     0.0476|     0.00244|           0.086|   0.153|  0.776| 91.577|   282907.0|  8.6637926E7| 434837.0| 16029.0|    true| 3.01947159E8|
|Red Hot Chili Pep...|    Dark Necessities|         The Getaway|     album|         0.7| 0.742| 5.0|  -6.777|     0.0716|      0.0722|          0.0199|    0.11|  0.197| 91.959|   302000.0| 4.40037964E8|2094182.0| 56516.0|    true| 3.85677873E8|
+--------------------+--------------------+--------------------+----------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-------+-----------+-------------+---------+--------+--------+-------------+
only showing top 20 rows

                                                                                
In [18]:
# Saving the top 600 tracks for later

# Expose the Spark frame to SQL under the view name "table0".
df_spark.createOrReplaceTempView("table0")

# The 600 rows with the highest Spotify stream counts, highest first.
top_streams_query = '''
SELECT *
FROM table0 
ORDER BY Stream DESC
LIMIT 600'''
df600_spot = spark.sql(top_streams_query).toPandas()

df600_spot
Out[18]:
Artist Track Album Album_type Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Licensed Stream
0 The Weeknd Blinding Lights After Hours album 0.514 0.730 1.0 -5.934 0.0598 0.00146 0.000095 0.0897 0.334 171.005 200040.0 6.741645e+08 8817927.0 282589.0 True 3.386520e+09
1 Ed Sheeran Shape of You ÷ (Deluxe) album 0.825 0.652 1.0 -3.183 0.0802 0.58100 0.000000 0.0931 0.931 95.977 233713.0 5.908398e+09 31047780.0 1130327.0 True 3.362005e+09
2 Lewis Capaldi Someone You Loved Divinely Uninspired To A Hellish Extent album 0.501 0.405 1.0 -5.679 0.0319 0.75100 0.000000 0.1050 0.446 109.891 182161.0 5.867684e+08 7367091.0 147565.0 True 2.634013e+09
3 Post Malone rockstar (feat. 21 Savage) beerbongs & bentleys album 0.585 0.520 5.0 -6.136 0.0712 0.12400 0.000070 0.1310 0.129 159.801 218147.0 1.060220e+09 12564657.0 366520.0 True 2.594927e+09
4 Swae Lee Sunflower - Spider-Man: Into the Spider-Verse Hollywood's Bleeding album 0.755 0.522 2.0 -4.368 0.0575 0.53300 0.000000 0.0685 0.925 89.960 157560.0 1.977389e+09 13749806.0 331064.0 True 2.538330e+09
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
595 Migos Walk It Talk It Culture II album 0.907 0.633 2.0 -5.145 0.1840 0.08760 0.000003 0.1060 0.395 145.914 276147.0 4.020519e+08 3589336.0 111565.0 True 6.754389e+08
596 Tove Lo Habits (Stay High) Queen Of The Clouds album 0.729 0.650 5.0 -3.539 0.0313 0.07020 0.000067 0.0829 0.347 110.020 209160.0 1.012196e+09 5376201.0 107220.0 True 6.749586e+08
597 A$AP Rocky F**kin' Problems (feat. Drake, 2 Chainz & Kend... LONG.LIVE.A$AP (Deluxe Version) album 0.853 0.693 1.0 -6.870 0.2750 0.02390 0.000000 0.1100 0.662 95.967 233787.0 2.962508e+08 1955320.0 61567.0 True 6.736265e+08
598 Billie Eilish Bored Bored single 0.614 0.318 7.0 -12.695 0.0478 0.89600 0.002390 0.0795 0.112 119.959 180933.0 5.467207e+07 1089600.0 18839.0 True 6.728659e+08
599 Big Sean I Don't Fuck With You Dark Sky Paradise album 0.824 0.733 1.0 -5.474 0.0613 0.03620 0.000000 0.3250 0.395 97.972 284387.0 3.823034e+08 3293569.0 102947.0 True 6.715107e+08

600 rows × 20 columns

In [19]:
# The 600 rows with the highest YouTube view counts, highest first.
# Reads the "table0" view, which must already be registered on the session.
top_views_query = '''
SELECT *
FROM table0 
ORDER BY Views DESC
LIMIT 600'''
df600_YT = spark.sql(top_views_query).toPandas()

df600_YT
Out[19]:
Artist Track Album Album_type Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Licensed Stream
0 Daddy Yankee Despacito VIDA album 0.655 0.797 2.0 -4.787 0.1530 0.19800 0.000000 0.0670 0.839 177.928 229360.0 8.079647e+09 50788626.0 4252791.0 True 1.506598e+09
1 Ed Sheeran Shape of You ÷ (Deluxe) album 0.825 0.652 1.0 -3.183 0.0802 0.58100 0.000000 0.0931 0.931 95.977 233713.0 5.908398e+09 31047780.0 1130327.0 True 3.362005e+09
2 Wiz Khalifa See You Again (feat. Charlie Puth) See You Again (feat. Charlie Puth) single 0.689 0.481 10.0 -7.503 0.0815 0.36900 0.000001 0.0649 0.283 80.025 229526.0 5.773797e+09 40147618.0 2127345.0 True 1.521255e+09
3 CoComelon Wheels on the Bus CoComelon Kids Hits, Vol. 1 album 0.941 0.387 9.0 -11.920 0.0427 0.18400 0.000029 0.1570 0.965 125.021 207340.0 4.898831e+09 14396841.0 0.0 True 8.343436e+07
4 Mark Ronson Uptown Funk (feat. Bruno Mars) Uptown Special album 0.856 0.609 0.0 -7.223 0.0824 0.00801 0.000082 0.0344 0.928 114.988 269667.0 4.821016e+09 20067879.0 598916.0 True 1.653820e+09
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
595 Nelly Furtado Promiscuous - Radio Edit Promiscuous (Radio Edit) single 0.786 0.979 10.0 -4.489 0.0440 0.09160 0.000021 0.3490 0.811 114.302 220867.0 5.161938e+08 2735269.0 76338.0 True 1.012045e+07
596 CoComelon Finger Family CoComelon Kids Hits, Vol. 2 album 0.816 0.565 9.0 -11.555 0.0360 0.35200 0.000014 0.0768 0.965 119.983 136003.0 5.160842e+08 1714627.0 0.0 True 2.684379e+07
597 Rick Ross Purple Lamborghini (with Rick Ross) Purple Lamborghini (with Rick Ross) single 0.457 0.899 10.0 -2.591 0.1320 0.00190 0.000000 0.8310 0.298 147.268 215510.0 5.160529e+08 4441252.0 147715.0 True 3.736007e+08
598 Prince Royce Bubalu Bubalu compilation 0.800 0.355 6.0 -10.470 0.0895 0.61600 0.000000 0.1180 0.253 145.929 228493.0 5.145548e+08 3010847.0 75208.0 True 3.349868e+08
599 Beyoncé Love On Top 4 album 0.652 0.749 0.0 -5.248 0.0886 0.08480 0.000000 0.6040 0.651 94.103 267413.0 5.145325e+08 2260106.0 93092.0 True 5.248471e+08

600 rows × 20 columns

Top Performers¶

Throughout this section, view count measures a track's popularity on YouTube, while stream count measures its popularity on Spotify.¶

Album type classification¶

In [20]:
# Register the Spark frame under the view name the query below reads from.
df_spark.createOrReplaceTempView("table1")

# Share of tracks per album type, as a percentage rounded to two decimals.
sqlDF = spark.sql('''
SELECT Album_type,
ROUND((COUNT(Album_type)*1.0/(SELECT COUNT(Album_type) FROM table1)*100),2) AS perc
FROM table1
GROUP BY Album_type''').toPandas()

irises_colors = ['rgb(33, 75, 99)', 'rgb(79, 129, 102)', 'rgb(151, 179, 100)']

# A scalar `pull` separates every slice by the same amount, so the chart does
# not silently assume exactly three album types the way a fixed-length list did.
fig = go.Figure(data=[go.Pie(labels=sqlDF['Album_type'], values=sqlDF['perc'], hole=.5, pull=0.02, marker_colors=irises_colors)])

# Give the figure a title so it can be read on its own.
fig.update_layout(title_text='Share of tracks by album type (%)', title_x=0.5)
fig.show()

Top artists on YouTube¶

In [21]:
# Register the Spark frame under the view name the query below reads from.
df_spark.createOrReplaceTempView("table2")

# Numeric columns averaged per artist. Each average keeps its source column
# name, so the summary can be indexed by the same labels as the track table.
avg_columns = ['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
               'Acousticness', 'Instrumentalness', 'Liveness', 'Valence',
               'Tempo', 'Duration_ms', 'Views', 'Likes', 'Comments', 'Stream']

# Generate the "AVG(x) AS x" projection once instead of hand-typing 15 copies.
avg_projection = ',\n'.join('AVG({0}) AS {0}'.format(name) for name in avg_columns)

# Ten artists with the highest average YouTube views.
# NOTE(review): artists are ranked by a per-track average, so an artist with a
# single remaining track can outrank artists with many — confirm this is intended.
sqlDF = spark.sql('''SELECT Artist,
{}
FROM table2
GROUP BY Artist
ORDER BY Views DESC
LIMIT 10'''.format(avg_projection)).toPandas()

sqlDF
23/05/02 18:28:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                
Out[21]:
Artist Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Stream
0 Katy Perry 0.655000 0.753000 5.250000 -4.822000 0.053737 0.039957 5.623750e-06 0.245125 0.565875 125.000500 221773.750000 1.552183e+09 7.298340e+06 372232.750000 6.807561e+08
1 Ed Sheeran 0.740375 0.657750 4.375000 -4.809000 0.062950 0.289125 2.311250e-05 0.190438 0.661375 106.736375 216951.875000 1.354188e+09 7.233060e+06 233036.250000 1.251813e+09
2 Dua Lipa 0.773250 0.715500 8.250000 -5.076000 0.076700 0.016010 3.900000e-06 0.099475 0.678000 110.007000 203405.250000 1.232530e+09 8.984291e+06 197210.250000 1.735381e+09
3 CoComelon 0.766857 0.401857 4.428571 -11.320429 0.040029 0.494857 1.900429e-05 0.096957 0.858143 146.948000 144556.428571 1.202930e+09 3.713634e+06 0.000000 3.756832e+07
4 The Weeknd 0.584167 0.637500 1.666667 -6.269333 0.085883 0.085293 1.902500e-05 0.224283 0.380667 142.643500 229499.833333 1.122235e+09 7.641168e+06 235070.666667 1.858146e+09
5 Wiz Khalifa 0.724667 0.651167 7.666667 -5.909333 0.071083 0.207433 1.716667e-07 0.180300 0.541333 136.015000 224813.000000 1.087298e+09 7.409934e+06 393581.666667 4.948412e+08
6 Daddy Yankee 0.772400 0.821200 3.900000 -4.783400 0.069220 0.133920 1.970000e-05 0.138710 0.682200 103.704700 206795.900000 1.087193e+09 6.551817e+06 467212.800000 5.135462e+08
7 Enrique Iglesias 0.671250 0.733125 5.375000 -4.887000 0.060625 0.222550 5.102250e-05 0.114375 0.634500 113.234125 231301.750000 1.040749e+09 3.578378e+06 99921.125000 3.355495e+08
8 DJ Snake 0.629667 0.798500 6.333333 -4.531000 0.103133 0.035372 1.370779e-01 0.192700 0.295167 124.661000 207896.833333 1.031645e+09 6.536074e+06 184021.333333 7.663337e+08
9 Macklemore & Ryan Lewis 0.612000 0.568000 4.000000 -8.555000 0.074300 0.523000 0.000000e+00 0.217000 0.230000 116.247000 346164.000000 1.012206e+09 6.604141e+06 242519.000000 5.662208e+07
In [22]:
# One bar chart per metric, laid out on a two-column grid, for the artist
# summary currently held in sqlDF.
metrics = [
    'Views', 'Likes', 'Comments', 'Stream',
    'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
    'Acousticness', 'Instrumentalness', 'Liveness', 'Valence',
    'Tempo', 'Duration_ms',
]

num_metrics = len(metrics)
cols = 2
rows = -(-num_metrics // cols)  # ceiling division: enough rows for every metric

fig = sp.make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=metrics,
    vertical_spacing=0.03,
    horizontal_spacing=0.035,
)

# The first column of the summary holds the artist names used on the x axis.
artist_labels = sqlDF[sqlDF.columns[0]]

for position, metric in enumerate(metrics):
    # Fill the grid left-to-right, top-to-bottom; plotly positions are 1-based.
    grid_row, grid_col = divmod(position, cols)
    bar = go.Bar(
        x=artist_labels,
        y=sqlDF[metric],
        name=metric,
        text=sqlDF[metric],
        textposition='outside',
        texttemplate='%{text:.2f}',
        textfont=dict(size=8),
    )
    fig.add_trace(bar, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    title_text='Average Metrics by Top Artists',
    title_x=0.5,
    font=dict(size=9),
    height=400 * rows,
    width=500 * cols,
    showlegend=False,
)

# Subplot titles are stored as layout annotations; set their font size.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=14)

fig.show()

Top artists on Spotify¶

In [23]:
# Expose the Spark DataFrame to Spark SQL under the name "table3".
df_spark.createOrReplaceTempView("table3")

# Per-artist averages of every metric, limited to the ten artists that rank
# highest on Stream, collected into a pandas DataFrame.
sqlDF = (
    spark.sql('''SELECT Artist, 
AVG(Danceability) AS Danceability, AVG(Energy) AS Energy, AVG(Key) AS Key, 
AVG(Loudness) AS Loudness, AVG(Speechiness) AS Speechiness, 
AVG(Acousticness) AS Acousticness, AVG(Instrumentalness) AS Instrumentalness, AVG(Liveness) AS Liveness,
AVG(Valence) AS Valence, AVG(Tempo) AS Tempo, AVG(Duration_ms) AS Duration_ms,
AVG(Views) AS Views, AVG(Likes) AS Likes, AVG(Comments) AS Comments, AVG(Stream) AS Stream
FROM table3
GROUP BY Artist 
ORDER BY Stream DESC 
LIMIT 10;''')
    .toPandas()
)

sqlDF
                                                                                
Out[23]:
Artist Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Stream
0 Drake 0.773000 0.537000 4.000000 -7.410000 0.081300 0.020480 0.000941 0.440500 0.363500 90.568000 186480.000000 8.185059e+08 8.453772e+06 306442.000000 2.312661e+09
1 The Weeknd 0.584167 0.637500 1.666667 -6.269333 0.085883 0.085293 0.000019 0.224283 0.380667 142.643500 229499.833333 1.122235e+09 7.641168e+06 235070.666667 1.858146e+09
2 Dua Lipa 0.773250 0.715500 8.250000 -5.076000 0.076700 0.016010 0.000004 0.099475 0.678000 110.007000 203405.250000 1.232530e+09 8.984291e+06 197210.250000 1.735381e+09
3 Post Malone 0.653571 0.622000 5.714286 -5.355714 0.075086 0.192029 0.000359 0.122471 0.366571 130.840286 200737.428571 4.079678e+08 4.284726e+06 108027.571429 1.513507e+09
4 XXXTENTACION 0.780250 0.524375 5.000000 -8.196125 0.151400 0.400712 0.001613 0.135825 0.430125 124.639125 138198.625000 3.038998e+08 5.270598e+06 373614.125000 1.368585e+09
5 Justin Bieber 0.614000 0.589571 2.428571 -6.922857 0.111143 0.411000 0.000004 0.242914 0.535429 123.849714 181000.000000 8.082079e+08 5.752261e+06 255620.285714 1.313070e+09
6 Shawn Mendes 0.679400 0.622600 6.600000 -7.199600 0.048980 0.221520 0.000029 0.086100 0.605320 121.408200 206256.000000 9.142391e+08 8.889465e+06 283394.600000 1.294543e+09
7 Ed Sheeran 0.740375 0.657750 4.375000 -4.809000 0.062950 0.289125 0.000023 0.190438 0.661375 106.736375 216951.875000 1.354188e+09 7.233060e+06 233036.250000 1.251813e+09
8 Khalid 0.645778 0.573667 3.111111 -7.611111 0.132578 0.309900 0.037364 0.121722 0.343667 111.576778 203433.333333 5.970037e+08 5.162533e+06 110469.333333 1.242641e+09
9 Coldplay 0.472700 0.570400 4.800000 -7.357100 0.030770 0.207006 0.005588 0.177930 0.290600 126.111200 261958.300000 9.997278e+08 6.315790e+06 212437.000000 1.177848e+09
In [24]:
# One bar-chart subplot is drawn for each of these sqlDF columns.
metrics = ['Stream', 'Views', 'Likes', 'Comments', 'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence','Tempo', 'Duration_ms']

# Derive the grid shape from this cell's own metric list instead of relying on
# `rows`/`cols` left behind by an earlier cell, so the cell still lays out
# correctly if it is run on its own or if the metric list changes.
cols = 2
rows = (len(metrics) + cols - 1) // cols

fig = sp.make_subplots(rows=rows, cols=cols, subplot_titles=metrics, vertical_spacing=0.03, horizontal_spacing=0.035)

# Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
for i, metric in enumerate(metrics):
    row = i // cols + 1
    col = i % cols + 1
    trace = go.Bar(x=sqlDF[sqlDF.columns[0]], y=sqlDF[metric], name=metric,
                   text=sqlDF[metric], textposition='outside', texttemplate='%{text:.2f}',
                   textfont=dict(size=8))
    fig.add_trace(trace, row=row, col=col)

fig.update_layout( title_text='Average Metrics by Top Artists', title_x=0.5, font=dict(size=9),
                  height=400 * rows, width=500 * cols, showlegend=False)

# Give every layout annotation (the subplot titles) a larger font.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=14)

fig.show()

Top tracks on YouTube¶

In [25]:
# Expose the Spark DataFrame to Spark SQL under the name "table4".
df_spark.createOrReplaceTempView("table4")

# The ten rows with the highest Views, collected into a pandas DataFrame.
sqlDF = (
    spark.sql('''SELECT *
FROM table4
ORDER BY Views DESC 
LIMIT 10;''')
    .toPandas()
)

sqlDF
Out[25]:
Artist Track Album Album_type Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Licensed Stream
0 Daddy Yankee Despacito VIDA album 0.655 0.797 2.0 -4.787 0.1530 0.19800 0.000000 0.0670 0.839 177.928 229360.0 8.079647e+09 50788626.0 4252791.0 True 1.506598e+09
1 Ed Sheeran Shape of You ÷ (Deluxe) album 0.825 0.652 1.0 -3.183 0.0802 0.58100 0.000000 0.0931 0.931 95.977 233713.0 5.908398e+09 31047780.0 1130327.0 True 3.362005e+09
2 Wiz Khalifa See You Again (feat. Charlie Puth) See You Again (feat. Charlie Puth) single 0.689 0.481 10.0 -7.503 0.0815 0.36900 0.000001 0.0649 0.283 80.025 229526.0 5.773797e+09 40147618.0 2127345.0 True 1.521255e+09
3 CoComelon Wheels on the Bus CoComelon Kids Hits, Vol. 1 album 0.941 0.387 9.0 -11.920 0.0427 0.18400 0.000029 0.1570 0.965 125.021 207340.0 4.898831e+09 14396841.0 0.0 True 8.343436e+07
4 Mark Ronson Uptown Funk (feat. Bruno Mars) Uptown Special album 0.856 0.609 0.0 -7.223 0.0824 0.00801 0.000082 0.0344 0.928 114.988 269667.0 4.821016e+09 20067879.0 598916.0 True 1.653820e+09
5 PSY Gangnam Style (강남스타일) Gangnam Style (강남스타일) single 0.727 0.937 11.0 -2.871 0.2860 0.00417 0.000000 0.0910 0.749 132.067 219493.0 4.679767e+09 26399133.0 5331537.0 False 3.709911e+08
6 Katy Perry Roar PRISM album 0.671 0.771 7.0 -4.821 0.0316 0.00492 0.000007 0.3540 0.436 90.003 223546.0 3.725749e+09 15864499.0 763366.0 True 8.847210e+08
7 OneRepublic Counting Stars Native album 0.664 0.705 1.0 -4.972 0.0382 0.06540 0.000000 0.1150 0.477 122.017 257840.0 3.721610e+09 16558621.0 475191.0 True 1.805320e+09
8 Justin Bieber Sorry Purpose (Deluxe) album 0.654 0.760 0.0 -3.669 0.0450 0.07970 0.000000 0.2990 0.410 99.945 200787.0 3.627306e+09 15789307.0 865675.0 True 1.740759e+09
9 Ed Sheeran Thinking out Loud x (Wembley Edition) album 0.781 0.445 2.0 -6.061 0.0295 0.47400 0.000000 0.1840 0.591 78.998 281560.0 3.547156e+09 14343730.0 362545.0 True 2.154334e+09

Top tracks on Spotify¶

In [26]:
# Expose the Spark DataFrame to Spark SQL under the name "table5".
df_spark.createOrReplaceTempView("table5")

# The ten rows with the highest Stream, collected into a pandas DataFrame.
sqlDF = (
    spark.sql('''SELECT *
FROM table5
ORDER BY Stream DESC 
LIMIT 10;''')
    .toPandas()
)

sqlDF
Out[26]:
Artist Track Album Album_type Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Duration_ms Views Likes Comments Licensed Stream
0 The Weeknd Blinding Lights After Hours album 0.514 0.730 1.0 -5.934 0.0598 0.00146 0.000095 0.0897 0.334 171.005 200040.0 6.741645e+08 8817927.0 282589.0 True 3.386520e+09
1 Ed Sheeran Shape of You ÷ (Deluxe) album 0.825 0.652 1.0 -3.183 0.0802 0.58100 0.000000 0.0931 0.931 95.977 233713.0 5.908398e+09 31047780.0 1130327.0 True 3.362005e+09
2 Lewis Capaldi Someone You Loved Divinely Uninspired To A Hellish Extent album 0.501 0.405 1.0 -5.679 0.0319 0.75100 0.000000 0.1050 0.446 109.891 182161.0 5.867684e+08 7367091.0 147565.0 True 2.634013e+09
3 Post Malone rockstar (feat. 21 Savage) beerbongs & bentleys album 0.585 0.520 5.0 -6.136 0.0712 0.12400 0.000070 0.1310 0.129 159.801 218147.0 1.060220e+09 12564657.0 366520.0 True 2.594927e+09
4 Swae Lee Sunflower - Spider-Man: Into the Spider-Verse Hollywood's Bleeding album 0.755 0.522 2.0 -4.368 0.0575 0.53300 0.000000 0.0685 0.925 89.960 157560.0 1.977389e+09 13749806.0 331064.0 True 2.538330e+09
5 Drake One Dance Views album 0.792 0.625 1.0 -5.609 0.0536 0.00776 0.001800 0.3290 0.370 103.967 173987.0 1.692883e+08 1662640.0 13775.0 False 2.522432e+09
6 Imagine Dragons Believer Evolve album 0.776 0.780 10.0 -4.374 0.1280 0.06220 0.000000 0.0810 0.666 124.949 204347.0 2.369715e+09 20483444.0 613230.0 True 2.369272e+09
7 Justin Bieber STAY (with Justin Bieber) F*CK LOVE 3+: OVER YOU album 0.591 0.764 1.0 -5.484 0.0483 0.03830 0.000000 0.1030 0.478 169.928 141806.0 6.812056e+08 10131328.0 247007.0 True 2.365778e+09
8 Shawn Mendes Señorita Shawn Mendes (Deluxe) album 0.759 0.548 9.0 -6.049 0.0290 0.03920 0.000000 0.0828 0.749 116.967 190800.0 1.487649e+09 19846114.0 640320.0 True 2.336220e+09
9 Glass Animals Heat Waves Dreamland (+ Bonus Levels) album 0.761 0.525 11.0 -6.900 0.0944 0.44000 0.000007 0.0921 0.531 80.870 238805.0 4.800890e+08 7145914.0 140345.0 True 2.261464e+09

How different metrics influence views or stream?¶

Let's assess how danceability, energy, speechiness, valence and tempo affect view/stream count.¶

In [27]:
# Views against danceability, coloured by energy. Columns are referenced by
# name (rather than re-passing Series taken from `df`) so Plotly Express labels
# the axes and colour bar with the column names.
fig = px.scatter(df, x='Danceability', y='Views', color='Energy', hover_data=['Track'],
                 title='Views vs. Danceability (coloured by Energy)')
fig.show()

Danceability shows a positive correlation with the number of views. The same holds for energy levels.¶

In [28]:
# Views against speechiness, coloured by energy. Columns are referenced by
# name so Plotly Express labels the axes and colour bar with the column names.
fig = px.scatter(df, x='Speechiness', y='Views', color='Energy', hover_data=['Track'],
                 title='Views vs. Speechiness (coloured by Energy)')
fig.show()

Musicality is celebrated more worldwide, as indicated by the 'speechiness' values of hit tracks.¶

Here, speechiness detects the presence of spoken words in a track.¶

In [29]:
# Views against tempo. Columns are referenced by name so Plotly Express labels
# the axes with the column names.
fig = px.scatter(df, x='Tempo', y='Views', hover_data=['Track'],
                 title='Views vs. Tempo')
fig.show()

Tempo is an essential element of musical composition. However, it is not a decisive factor in a track's success.¶

As one can see, the probability of a hit track is marginally higher when the tempo falls between 70 and 150.¶

In [30]:
# Streams against valence, coloured by energy. Columns are referenced by name
# so Plotly Express labels the axes and colour bar with the column names.
fig = px.scatter(df, x='Valence', y='Stream', color='Energy', hover_data=['Track'],
                 title='Stream vs. Valence (coloured by Energy)')
fig.show()

Not much can be said about valence. Valence essentially conveys musical positiveness.¶

In [31]:
# Tempo against danceability, coloured by stream count, with an OLS trendline.
# Columns are referenced by name so Plotly Express labels the axes and colour
# bar with the column names.
fig = px.scatter(df, x='Danceability', y='Tempo', color='Stream', trendline='ols',
                 title='Tempo vs. Danceability (coloured by Stream)')
fig.show()

Although this plot supports our findings about the tempo range, no clear conclusions can be drawn about a track's success.¶

In [32]:
# Views against loudness, coloured by danceability. Columns are referenced by
# name so Plotly Express labels the axes and colour bar with the column names.
fig = px.scatter(df, x='Loudness', y='Views', color='Danceability', hover_data=['Track'],
                 title='Views vs. Loudness (coloured by Danceability)')
fig.show()

Loudness has the strongest positive correlation with the number of views.¶

Some additional analytics¶

The like-to-view ratio (likes divided by views) indicates positive feedback and audience engagement/loyalty¶

In [33]:
# Likes per view for each track, rounded to 4 decimals. Zero view counts are
# mapped to NaN before dividing so the ratio can never be +/-inf; an inf value
# would otherwise sort to the top of the descending ranking below.
df['V/L_Ratio'] = (df['Likes'] / df['Views'].replace(0, np.nan)).round(4)

# The twenty tracks with the highest ratio (NaN ratios sort last).
vl = df.sort_values(by='V/L_Ratio', ascending=False).head(20)[['Track', 'Artist', 'Views', 'Likes', 'V/L_Ratio']]

# Columns are referenced by name so Plotly Express labels the axes with them.
fig = px.bar(vl, x='Track', y='V/L_Ratio', hover_data=['Artist'])
fig.show()

Top Albums¶

In [34]:
# Expose the Spark DataFrame to Spark SQL under the name "table6".
df_spark.createOrReplaceTempView("table6")

# Total streams per (album, artist) pair, keeping the fifteen largest totals.
sqlDF = spark.sql('''SELECT Album, Artist, SUM(Stream) AS Streams
FROM table6
GROUP BY Album, Artist
ORDER BY SUM(Stream) DESC 
LIMIT 15;''').toPandas()

# The bare `sqlDF` expression that used to sit here was removed: it was not the
# last expression of the cell, so it never displayed anything.
# Columns are referenced by name so Plotly Express labels the axes with them.
fig = px.line(sqlDF, x='Album', y='Streams',
              hover_data=['Artist'], markers=True)
fig.show()
                                                                                

Most Liked Artists¶

In [35]:
# Expose the Spark DataFrame to Spark SQL under the name "table7".
df_spark.createOrReplaceTempView("table7")

# Total likes per artist, keeping the fifteen largest totals. The query reads
# from the view registered just above ("table7"); it previously read "table6",
# which only worked when an earlier cell had already registered that view.
sqlDF = spark.sql('''SELECT Artist, SUM(Likes) AS Total_Likes
FROM table7
GROUP BY Artist
ORDER BY SUM(Likes) DESC 
LIMIT 15;''').toPandas()

# Columns are referenced by name so Plotly Express labels the axes with them.
fig = px.bar(sqlDF, x='Artist', y='Total_Likes')
fig.show()

Prediction modelling using different machine learning algorithms¶

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
In [37]:
# Display the working DataFrame (it now carries the V/L_Ratio column) before modelling.
df
Out[37]:
Artist Track Album Album_type Danceability Energy Key Loudness Speechiness Acousticness ... Liveness Valence Tempo Duration_ms Views Likes Comments Licensed Stream V/L_Ratio
0 Gorillaz Feel Good Inc. Demon Days album 0.818 0.705 6.0 -6.679 0.1770 0.008360 ... 0.6130 0.7720 138.559 222640.0 693555221.0 6220896.0 169907.0 True 1.040235e+09 0.0090
1 Gorillaz Rhinestone Eyes Plastic Beach album 0.676 0.703 8.0 -5.815 0.0302 0.086900 ... 0.0463 0.8520 92.761 200173.0 72011645.0 1079128.0 31003.0 True 3.100837e+08 0.0150
2 Gorillaz New Gold (feat. Tame Impala and Bootie Brown) New Gold (feat. Tame Impala and Bootie Brown) single 0.695 0.923 1.0 -3.930 0.0522 0.042500 ... 0.1160 0.5510 108.014 215150.0 8435055.0 282142.0 7399.0 True 6.306347e+07 0.0334
3 Gorillaz On Melancholy Hill Plastic Beach album 0.689 0.739 2.0 -5.810 0.0260 0.000015 ... 0.0640 0.5780 120.423 233867.0 211754952.0 1788577.0 55229.0 True 4.346636e+08 0.0084
4 Gorillaz Clint Eastwood Gorillaz album 0.663 0.694 10.0 -8.627 0.1710 0.025300 ... 0.0698 0.5250 167.953 340920.0 618480958.0 6197318.0 155930.0 True 6.172597e+08 0.0100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20713 SICK LEGEND JUST DANCE HARDSTYLE JUST DANCE HARDSTYLE single 0.582 0.926 5.0 -6.344 0.0328 0.448000 ... 0.0839 0.6580 90.002 94667.0 71678.0 1113.0 0.0 True 9.227144e+06 0.0155
20714 SICK LEGEND SET FIRE TO THE RAIN HARDSTYLE SET FIRE TO THE RAIN HARDSTYLE single 0.531 0.936 4.0 -1.786 0.1370 0.028000 ... 0.0923 0.6570 174.869 150857.0 164741.0 2019.0 0.0 True 1.089818e+07 0.0123
20715 SICK LEGEND OUTSIDE HARDSTYLE SPED UP OUTSIDE HARDSTYLE SPED UP single 0.443 0.830 4.0 -4.679 0.0647 0.024300 ... 0.1540 0.4190 168.388 136842.0 35646.0 329.0 0.0 True 6.226110e+06 0.0092
20716 SICK LEGEND ONLY GIRL HARDSTYLE ONLY GIRL HARDSTYLE single 0.417 0.767 9.0 -4.004 0.4190 0.356000 ... 0.1080 0.5390 155.378 108387.0 6533.0 88.0 0.0 True 6.873961e+06 0.0135
20717 SICK LEGEND MISS YOU HARDSTYLE MISS YOU HARDSTYLE single 0.498 0.938 6.0 -4.543 0.1070 0.002770 ... 0.1360 0.0787 160.067 181500.0 158697.0 2484.0 0.0 True 5.695584e+06 0.0157

16866 rows × 21 columns

In [38]:
# List the column names available in the working DataFrame.
df.columns
Out[38]:
Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
       'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Views', 'Likes',
       'Comments', 'Licensed', 'Stream', 'V/L_Ratio'],
      dtype='object')

Linear Regression Model¶

In [39]:
# Predictors: the audio-feature columns only, in DataFrame column order.
# They are listed explicitly (instead of dropping everything else) so that a
# column added to `df` later cannot silently become a model input, and so the
# ordering of the hand-written `new_data` rows below can be checked against it.
# NOTE: the features are used as-is; no scaling is applied.
feature_cols = ['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness',
                'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms']
X = df[feature_cols].values
y = df['Views']  # target: view count

# Split the dataset into training (80%) and test (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the linear regression model
reg = LinearRegression()
reg.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error:', mse)
print('Mean absolute error:', mae)
print('R-squared:', r2)

# Making predictions on new data (one value per entry of `feature_cols`, same order)
new_data = [[0.7, 0.3, 5, -4, 0.2, 0.01, 0.004, 0.09, 0.5, 120, 233347.0], 
            [0.5, 0.25, 7, -5.7, 0.35, 0.06, 0.008, 0.1, 0.8, 78, 363262.0]]
new_preds = reg.predict(new_data)
print('Predictions:', new_preds)
Mean squared error: 6.269089705294543e+16
Mean absolute error: 109784142.56687756
R-squared: 0.020073031150982357
Predictions: [1.44411764e+08 1.24250442e+08]

The Linear Regression Model doesn't give us good accuracy, with R-squared being close to 0.¶

Random Forest Model¶

In [40]:
# Fit a 100-tree random forest (fixed seed) on the training split.
regRF = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out test split.
y_pred = regRF.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for label, value in (
    ('Mean squared error:', mse),
    ('Mean absolute error:', mae),
    ('R-squared:', r2),
):
    print(label, value)

# Predict for two hand-written feature rows.
new_data = [
    [0.7, 0.3, 5, -4, 0.2, 0.01, 0.004, 0.09, 0.5, 120, 233347.0],
    [0.5, 0.25, 7, -5.7, 0.35, 0.06, 0.008, 0.1, 0.8, 78, 363262.0],
]
new_preds = regRF.predict(new_data)
print('Predictions:', new_preds)
Mean squared error: 6.5106613912709176e+16
Mean absolute error: 118230420.04959916
R-squared: -0.01768725321672293
Predictions: [3.06437729e+08 3.27065231e+08]

Well, a negative R-squared value always scares me.¶

Conclusion:¶

Clearly, the features of the dataset show almost no correlation with view counts, so any prediction is barely reliable.¶

Hence, our hypothesis is true and the dataset is fundamentally unfit for predictive analysis.¶